
PoincareMSA builds a projection of protein multiple sequence alignemnt (MSA) on a Poincaré disk. The proximity of the points to the disk center corresponds to their hierarchy and correlates with the proximity of the proteins to the root of the phylogenetic tree. Thus, must central point often correspond to the ancestor proteins and protein located close to the border to the leaves of phylogenetic tree.
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
#Import visualization functions
from scripts.visualize_projection.pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
%matplotlib inline
#Create optional variables
path_annotation = ""
# OPTIONS =================================================
mfasta = "new_examples/kinases/kinases.mfasta"
path_annotation = "new_examples/kinases/kinase_group_new.csv"
#==========================================================
#Check files
#mfasta
nb_seq = 0
if os.path.isfile(mfasta):
with open(mfasta, "r") as f_in:
for line in f_in:
if line[0] == ">":
nb_seq += 1
print(f"\nNumber of sequences found: {nb_seq}.")
else:
print(f"File {mfasta} not found.")
#Annotations
if os.path.isfile(path_annotation):
try:
df_annotation = pd.read_csv(path_annotation)
if len(df_annotation) != nb_seq:
raise ValueError("Annotation file doesn't match the .mfasta file length.")
except:
raise ValueError("Annotation file is not in .csv format.")
print("\nAnnotation file correctly loaded.")
annotation_names = list(df_annotation.columns)
print(f"{len(annotation_names)} annotations found: {annotation_names}.")
else:
print(f"File {path_annotation} not found")
Number of sequences found: 497. Annotation file correctly loaded. 18 annotations found: ['proteins_id', '1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc', '6_Domain_begin', '7_Domain_end', '8_Domain_length', '9_Largest_insert_length', '10_PDB_validation', '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni', '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain'].
# OPTIONS =================================================
# Job name
out_name = "kinases_data"
#----------------------------------------------------------
# Threshold for filtering gapped positions
gapth = 0.9
#----------------------------------------------------------
# Projection parameters
knn = 5
gamma = 2
sigma = 1
cospca = 0
batchs = 4
epochs = 1000
seed = 0
#==========================================================
# 1. Data preparation
# Data preparation consists in `.mfasta` cleaning according to a gap threshold and translation of each sequence to the PSSM profile.
print("1. Data preparation")
prep_parameters = "scripts/prepare_data" + " " + mfasta + " " + out_name + " " + out_name + " " + str(gapth)
bash_projection = "bash scripts/prepare_data/create_projection.sh " + prep_parameters
!{bash_projection}
print("\n2. Data projection using Poincaré disk")
# 2. Data projection using Poincaré disk
#This step creates a projection of encoded sequences to a Poincaré disk.
bash_pm = "python3 "+ "scripts/build_poincare_map/main.py --input_path " + out_name + "/fasta" + str(gapth) + " --output_path " + out_name + "/projections/ --gamma "+ str(gamma) +" --pca "+ str(cospca) + " --epochs "+ str(epochs) +" --seed "+ str(seed) + " --knn " + str(knn)
!{bash_pm}
print("\n3. Format data for visualization")
# 3. Format data for visualization
#Check that an annotation file was provided. Create a dummy one instead
if not path_annotation:
df_annotation = pd.DataFrame(np.full(nb_seq, "-", dtype=object), columns=["default"])
df_annotation.to_csv("dummy_annotation.csv", index=False)
path_annotation = "dummy_annotation.csv"
annotation_names = ["default"]
path_embedding = f"{out_name}/projections/PM{knn:1.0f}sigma={sigma:2.2f}gamma={gamma:2.2f}cosinepca={cospca:1.0f}_seed{seed:1.0f}.csv"
df_embedding = read_embeddings(path_embedding, path_annotation, withroot=False)
1. Data preparation Input file: new_examples/kinases/kinases.mfasta Name of the protein family: kinases 23923 X aa replaced by gaps in 497 sequences filter_gaps finished for new_examples/kinases/kinases.mfasta mfasta2fasta finished for kinases_data/kinases_data.clean0.9.mfasta 23923 X aa replaced by gaps in 497 sequences 2. Data projection using Poincaré disk CUDA: True 497 proteins found in folder kinases_data/fasta0.9. No root detected Prepare data: tensor construction Prepare data: successfully terminated Computing laplacian... Laplacian computed in 0.10 sec Computing RFA... RFA computed in 0.06 sec Starting training... loss: 0.60271: 100%|████████████████████████| 1000/1000 [16:19<00:00, 1.02it/s] PM computed in 979.56 sec loss = 6.027e-01 time = 16.329 min 3. Format data for visualization
# Construction of custom color palette
kinase_palette = {-1 : "#c7c7c7", "OTHER": "#c7c7c7", "None" :"#c7c7c7", "NA" : "#c7c7c7", "Uncharacterized" : "#c7c7c7", "root": "#000000",
"TYR": "#bd065f", "CMGC": "#d5c203", "TKL": "#997e73","STE": "#80b412", # kinase groups
"CK1": "#0dbae9", "AGC": "#00bba1", "CAMK": "#1f6ed4", "NEK": "#8ce4fa", "RGC":"#f59a62"}
# OPTIONS =================================================
title = "PM projection colored by kinase groups"
#----------------------------------------------------------
# Select the coloring from annotation .csv file:
labels_name = "1_Group"
# Select classes to label among the "labels_name" column (comma separated list):
labels_text = ""
show_text = False
#----------------------------------------------------------
# Use a custom color palette:
color_palette = kinase_palette
use_custom_palette = True
#==========================================================
#Check projection visualization parameters
#Labels name
if labels_name == "" and path_annotation == "dummy_annotation.csv":
labels_name = "default"
elif labels_name == "":
labels_name = None
elif labels_name not in annotation_names:
raise NameError(f"labels_name {labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}")
#Labels text
try:
labels_text = [s.strip() for s in labels_text.split(",")]
except:
print("Error: label_text field is not a valid list.")
#Convert labels_text to labels_name dtype
if labels_name and labels_name != "default":
try:
labels_text_dtype = df_annotation[labels_name].dtypes
labels_text = list(np.array(labels_text).astype(labels_text_dtype))
except:
raise TypeError(f'"labels_text" is not compatible with {labels_name}" data format ({labels_text_dtype}).')
if not use_custom_palette:
color_palette = None
#Plot graph
fig = plot_embedding_interactive(df_embedding,
labels_name = labels_name,
show_text = show_text,
labels_text = labels_text,
color_palette = color_palette,
title = title,
fontsize = 11)
fig.show()
# OPTIONS =================================================
title = "PM projection on kinases by kinase groups in Uniprot"
#----------------------------------------------------------
# Select the coloring from annotation .csv file:
labels_name = "13_Group_in_Uni"
# Select classes to label among the "labels_name" column (comma separated list):
labels_text = ""
show_text = False
#----------------------------------------------------------
# Use a custom color palette:
color_palette = kinase_palette
use_custom_palette = True
#==========================================================
#Check projection visualization parameters
#Labels name
if labels_name == "" and path_annotation == "dummy_annotation.csv":
labels_name = "default"
elif labels_name == "":
labels_name = None
elif labels_name not in annotation_names:
raise NameError(f"labels_name {labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}")
#Labels text
if labels_text:
try:
labels_text = [s.strip() for s in labels_text.split(",")]
except:
print("Error: label_text field is not a valid list.")
else:
labels_text = [""]
#Convert labels_text to labels_name dtype
if labels_name and labels_name != "default" and labels_text != [""]:
try:
labels_text_dtype = df_annotation[labels_name].dtypes
labels_text = list(np.array(labels_text).astype(labels_text_dtype))
except:
raise TypeError(f'"labels_text" is not compatible with {labels_name}" data format ({labels_text_dtype}).')
if not use_custom_palette:
color_palette = None
#Plot graph
fig = plot_embedding_interactive(df_embedding,
labels_name = labels_name,
show_text = show_text,
labels_text = labels_text,
color_palette = color_palette,
title = title,
fontsize = 11)
fig.show()
# OPTIONS =================================================
output_name = "fig1"
output_format = "png" #Format availables: ["png", "html", "pdf", "svg"]
#==========================================================
if output_format != "html":
fig.write_image(f"{output_name}.{output_format}", engine="kaleido")
else:
fig.write_html(f"{output_name}.{output_format}")